{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1260ea17-8840-46b7-a208-053c6ed797fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "import nltk\n",
    "import re\n",
    "import os\n",
    "import numpy as np\n",
    "from nltk import download\n",
    "from gensim.models import LdaMulticore, CoherenceModel\n",
    "from gensim.corpora.dictionary import Dictionary\n",
    "from nltk.tokenize import word_tokenize\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5bcf9e89-6cd1-42d5-82ce-68db4e17e5dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('/zfs/disinfo/dcweekly_contrast/story_text/dc_weekly_topic_scores.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9027a3f8-384d-440c-a702-cf119128fd86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make a binary indicator for whether we are in the AI-generated text period of time\n",
    "df['period'] = df.month.apply(lambda x: 1- int(x=='2023-06'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c96a6ed5-5e6e-4fcd-b22d-78de199ef1d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /scratch/cehrett/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     /scratch/cehrett/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Explicitly set the NLTK data directory\n",
    "nltk_data_dir = os.path.join('/', 'scratch', 'cehrett', 'nltk_data')\n",
    "os.environ['NLTK_DATA'] = nltk_data_dir\n",
    "\n",
    "# Ensure the directory is added to NLTK's search path\n",
    "nltk.data.path.append(nltk_data_dir)\n",
    "\n",
    "# Download the necessary packages\n",
    "nltk.download('stopwords', download_dir=nltk_data_dir)\n",
    "nltk.download('wordnet', download_dir=nltk_data_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b93a9fab-776b-4793-a4fa-872d4e78b3b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /scratch/cehrett/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     /scratch/cehrett/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to /scratch/cehrett/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# Download necessary NLTK data\n",
    "download('stopwords', download_dir=os.path.join('/', 'scratch', 'cehrett', 'nltk_data'))\n",
    "download('wordnet', download_dir=os.path.join('/', 'scratch', 'cehrett', 'nltk_data'))\n",
    "download('punkt', download_dir=os.path.join('/', 'scratch', 'cehrett', 'nltk_data'))\n",
    "\n",
    "# Function to clean and preprocess text\n",
    "def preprocess_text(text):\n",
    "    # Remove HTML tags\n",
    "    text = re.sub(r'<.*?>', '', text)\n",
    "    # Remove punctuation and numbers\n",
    "    text = re.sub(r'[^a-zA-Z\\s]', '', text)\n",
    "    # Lowercase\n",
    "    text = text.lower()\n",
    "    # Remove stopwords\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    text = ' '.join([word for word in text.split() if word not in stop_words])\n",
    "    # Lemmatization\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])\n",
    "    return text\n",
    "\n",
    "# Preprocess the text in the DataFrame\n",
    "df['processed_text'] = df['clean_text'].apply(preprocess_text)\n",
    "\n",
    "# Split the DataFrame into two categories based on the 'period' column\n",
    "df_period_0 = df[df['period'] == 0]\n",
    "df_period_1 = df[df['period'] == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "de788165-987d-48dd-bf3c-fc4e45a0a176",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local_scratch/pbs.2108211.pbs02/ipykernel_2337547/2504636202.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_period_0['tokens'] = df_period_0['processed_text'].apply(word_tokenize)\n",
      "/local_scratch/pbs.2108211.pbs02/ipykernel_2337547/2504636202.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_period_1['tokens'] = df_period_1['processed_text'].apply(word_tokenize)\n"
     ]
    }
   ],
   "source": [
    "# Tokenize the texts\n",
    "df_period_0['tokens'] = df_period_0['processed_text'].apply(word_tokenize)\n",
    "df_period_1['tokens'] = df_period_1['processed_text'].apply(word_tokenize)\n",
    "\n",
    "# Create Dictionaries and Corpora\n",
    "dictionary_0 = Dictionary(df_period_0['tokens'])\n",
    "corpus_0 = [dictionary_0.doc2bow(text) for text in df_period_0['tokens']]\n",
    "\n",
    "dictionary_1 = Dictionary(df_period_1['tokens'])\n",
    "corpus_1 = [dictionary_1.doc2bow(text) for text in df_period_1['tokens']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b0d3a5d3-c122-4476-88c9-e893278729d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_coherence_values(dictionary, corpus, texts, start, limit, step):\n",
    "    \"\"\"\n",
    "    Computes coherence scores for LDA models with varying number of topics.\n",
    "    \n",
    "    This function iterates over a specified range of topic numbers, fitting an LDA model for each count,\n",
    "    and calculates the coherence score to evaluate the model's quality. It uses the LdaMulticore class\n",
    "    for efficient parallel computation. Key hyperparameters are chosen to balance model complexity,\n",
    "    convergence speed, and coherence quality:\n",
    "    \n",
    "    - `num_topics`: Varies in steps from `start` to `limit` to explore the optimal number of topics.\n",
    "    - `random_state`: Ensures reproducibility of results.\n",
    "    - `chunksize`: Number of documents to process at a time in the training algorithm. A balance between\n",
    "      memory consumption and computational efficiency.\n",
    "    - `passes`: The number of passes through the corpus during training, for sufficient model convergence.\n",
    "    - `alpha`: Set to 'asymmetric' to account for the intuition that some topics are more probable than others.\n",
    "    - `eta`: Left as `None` to use the default symmetric prior, assuming no prior knowledge about topic-word distribution.\n",
    "    - `decay` and `offset`: Control the learning rate, helping to stabilize early iterations of the model training.\n",
    "    - `eval_every`: Determines how often to calculate log perplexity; set to optimize the trade-off between\n",
    "      training speed and model evaluation.\n",
    "    - `iterations`: Max number of iterations over each document; higher for better model precision.\n",
    "    - `gamma_threshold`: Convergence threshold for gamma, the document-topic density.\n",
    "    - `minimum_probability`: Filters out topics with a probability below this threshold in the topics-per-document distribution.\n",
    "    - `minimum_phi_value`: Lower bound on term probabilities, filtering out terms.\n",
    "    - `per_word_topics`: If True, computes a list of most likely topics for each word, alongside its phi value times feature length.\n",
    "    - `workers`: Utilizes 7 of 8 available cores, leaving one free to prevent resource saturation.\n",
    "    \n",
    "    Parameters:\n",
    "        dictionary (gensim.corpora.dictionary.Dictionary): The dictionary mapping of id->word.\n",
    "        corpus (list of list of (int, float)): Corpus represented as a list of document vectors or sparse matrix.\n",
    "        texts (list of list of str): Text data for coherence calculation; should match `corpus`.\n",
    "        start (int): The starting number of topics.\n",
    "        limit (int): The max number of topics to model.\n",
    "        step (int): The step size to iterate through the number of topics.\n",
    "    \n",
    "    Returns:\n",
    "        model_list (list of gensim.models.LdaMulticore): List of LDA multicore models for each number of topics.\n",
    "        coherence_values (list of float): Coherence values corresponding to each model. \n",
    "        The `c_v` coherence measure is chosen for its balance between interpretability and informativeness. \n",
    "        Unlike other coherence measures that may rely solely on document co-occurrence data (`u_mass`) \n",
    "        or on similarity between top words (`c_uci`, `c_npmi`), `c_v` combines a variety of data sources, \n",
    "        including word co-occurrence and document-level information, to provide a robust assessment of topic quality. \n",
    "        This measure has been shown to correlate well with human judgments of topic coherence, \n",
    "        making it a suitable choice for evaluating the meaningfulness and distinctiveness of the topics discovered by LDA models. \n",
    "        Additionally, `c_v` allows for the incorporation of sliding window techniques, \n",
    "        which can capture more nuanced semantic relationships between words, further enhancing the evaluation of topic coherence.\n",
    "\n",
    "    \"\"\"\n",
    "    coherence_values = []\n",
    "    model_list = []\n",
    "    for num_topics in range(start, limit, step):\n",
    "        print(f'Beginning LDA with {num_topics} topics')\n",
    "        model = LdaMulticore(corpus=corpus,\n",
    "                             id2word=dictionary,\n",
    "                             num_topics=num_topics,\n",
    "                             random_state=100,\n",
    "                             chunksize=100,\n",
    "                             passes=10,\n",
    "                             alpha='asymmetric',\n",
    "                             eta=None,\n",
    "                             decay=0.5,\n",
    "                             offset=64,\n",
    "                             eval_every=10,\n",
    "                             iterations=400,\n",
    "                             gamma_threshold=0.001,\n",
    "                             minimum_probability=0.01,\n",
    "                             minimum_phi_value=0.01,\n",
    "                             per_word_topics=True,\n",
    "                             workers=5)  # Utilize 7 of 8 cores for efficiency\n",
    "        model_list.append(model)\n",
    "        print('LDA model fitted. Beginning coherence calculation')\n",
    "        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
    "        coherence_values.append(coherencemodel.get_coherence())\n",
    "\n",
    "    return model_list, coherence_values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "808621a9-2995-4f44-b792-09ce08c32b1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Beginning LDA with 4 topics\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Process ForkPoolWorker-622:\n",
      "Process ForkPoolWorker-626:\n",
      "Traceback (most recent call last):\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/process.py\", line 315, in _bootstrap\n",
      "    self.run()\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Compute coherence values for each set of topics\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model_list_0, coherence_values_0 \u001b[38;5;241m=\u001b[39m \u001b[43mcompute_coherence_values\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdictionary\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdictionary_0\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus_0\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtexts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdf_period_0\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtokens\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m12\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[8], line 52\u001b[0m, in \u001b[0;36mcompute_coherence_values\u001b[0;34m(dictionary, corpus, texts, start, limit, step)\u001b[0m\n\u001b[1;32m     50\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m num_topics \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(start, limit, step):\n\u001b[1;32m     51\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBeginning LDA with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_topics\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m topics\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 52\u001b[0m     model \u001b[38;5;241m=\u001b[39m \u001b[43mLdaMulticore\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     53\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdictionary\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     54\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_topics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     55\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     56\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     57\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     58\u001b[0m \u001b[43m                         \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43masymmetric\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     59\u001b[0m \u001b[43m                         \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     60\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mdecay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.5\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     61\u001b[0m \u001b[43m                         \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m64\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     62\u001b[0m \u001b[43m                         \u001b[49m\u001b[43meval_every\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     63\u001b[0m \u001b[43m                         \u001b[49m\u001b[43miterations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m400\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     64\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mgamma_threshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.001\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     65\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mminimum_probability\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     66\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mminimum_phi_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     67\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mper_word_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     68\u001b[0m \u001b[43m                         \u001b[49m\u001b[43mworkers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# Utilize 7 of 8 cores for efficiency\u001b[39;00m\n\u001b[1;32m     69\u001b[0m     model_list\u001b[38;5;241m.\u001b[39mappend(model)\n\u001b[1;32m     70\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mLDA model fitted. Beginning coherence calculation\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/site-packages/gensim/models/ldamulticore.py:186\u001b[0m, in \u001b[0;36mLdaMulticore.__init__\u001b[0;34m(self, corpus, num_topics, id2word, workers, chunksize, passes, batch, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, random_state, minimum_probability, minimum_phi_value, per_word_topics, dtype)\u001b[0m\n\u001b[1;32m    183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(alpha, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m alpha \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m    184\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto-tuning alpha not implemented in LdaMulticore; use plain LdaModel.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m    187\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_topics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    188\u001b[0m \u001b[43m    \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid2word\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpasses\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    189\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdecay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meval_every\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meval_every\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43miterations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43miterations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    190\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgamma_threshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgamma_threshold\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mminimum_probability\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_probability\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    191\u001b[0m \u001b[43m    \u001b[49m\u001b[43mminimum_phi_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_phi_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mper_word_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mper_word_topics\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    192\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/site-packages/gensim/models/ldamodel.py:521\u001b[0m, in \u001b[0;36mLdaModel.__init__\u001b[0;34m(self, corpus, num_topics, id2word, distributed, chunksize, passes, update_every, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, minimum_probability, random_state, ns_conf, minimum_phi_value, per_word_topics, callbacks, dtype)\u001b[0m\n\u001b[1;32m    519\u001b[0m use_numpy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdispatcher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    520\u001b[0m start \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 521\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupdate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunks_as_numpy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_numpy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    522\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_lifecycle_event(\n\u001b[1;32m    523\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcreated\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    524\u001b[0m     msg\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrained \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mtime()\u001b[38;5;250m \u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;250m \u001b[39mstart\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    525\u001b[0m )\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/site-packages/gensim/models/ldamulticore.py:316\u001b[0m, in \u001b[0;36mLdaMulticore.update\u001b[0;34m(self, corpus, chunks_as_numpy)\u001b[0m\n\u001b[1;32m    312\u001b[0m \u001b[38;5;66;03m# endfor single corpus pass\u001b[39;00m\n\u001b[1;32m    313\u001b[0m \n\u001b[1;32m    314\u001b[0m \u001b[38;5;66;03m# wait for all outstanding jobs to finish\u001b[39;00m\n\u001b[1;32m    315\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m queue_size[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 316\u001b[0m     \u001b[43mprocess_result_queue\u001b[49m\u001b[43m(\u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m    318\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reallen \u001b[38;5;241m!=\u001b[39m lencorpus:\n\u001b[1;32m    319\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput corpus size changed during training (don\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt use generators as input)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/site-packages/gensim/models/ldamulticore.py:274\u001b[0m, in \u001b[0;36mLdaMulticore.update.<locals>.process_result_queue\u001b[0;34m(force)\u001b[0m\n\u001b[1;32m    268\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    269\u001b[0m \u001b[38;5;124;03mClear the result queue, merging all intermediate results, and update the\u001b[39;00m\n\u001b[1;32m    270\u001b[0m \u001b[38;5;124;03mLDA model if necessary.\u001b[39;00m\n\u001b[1;32m    271\u001b[0m \n\u001b[1;32m    272\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    273\u001b[0m merged_new \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 274\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mresult_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mempty\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m    275\u001b[0m     other\u001b[38;5;241m.\u001b[39mmerge(result_queue\u001b[38;5;241m.\u001b[39mget())\n\u001b[1;32m    276\u001b[0m     queue_size[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/queues.py:129\u001b[0m, in \u001b[0;36mQueue.empty\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    128\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mempty\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 129\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_poll\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/connection.py:257\u001b[0m, in \u001b[0;36m_ConnectionBase.poll\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_closed()\n\u001b[1;32m    256\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_readable()\n\u001b[0;32m--> 257\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_poll\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/connection.py:424\u001b[0m, in \u001b[0;36mConnection._poll\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    423\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_poll\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout):\n\u001b[0;32m--> 424\u001b[0m     r \u001b[38;5;241m=\u001b[39m \u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    425\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mbool\u001b[39m(r)\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/connection.py:923\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(object_list, timeout)\u001b[0m\n\u001b[1;32m    917\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwait\u001b[39m(object_list, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m    918\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[1;32m    919\u001b[0m \u001b[38;5;124;03m    Wait till an object in object_list is ready/readable.\u001b[39;00m\n\u001b[1;32m    920\u001b[0m \n\u001b[1;32m    921\u001b[0m \u001b[38;5;124;03m    Returns list of those objects in object_list which are ready/readable.\u001b[39;00m\n\u001b[1;32m    922\u001b[0m \u001b[38;5;124;03m    '''\u001b[39;00m\n\u001b[0;32m--> 923\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43m_WaitSelector\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m selector:\n\u001b[1;32m    924\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m object_list:\n\u001b[1;32m    925\u001b[0m             selector\u001b[38;5;241m.\u001b[39mregister(obj, selectors\u001b[38;5;241m.\u001b[39mEVENT_READ)\n",
      "File \u001b[0;32m~/.conda/envs/catching_trolls/lib/python3.9/selectors.py:350\u001b[0m, in \u001b[0;36m_PollLikeSelector.__init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    348\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m    349\u001b[0m     \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m()\n\u001b[0;32m--> 350\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_selector \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_selector_cls\u001b[49m()\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/process.py\", line 315, in _bootstrap\n",
      "    self.run()\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/process.py\", line 108, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/process.py\", line 108, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/pool.py\", line 109, in worker\n",
      "    initializer(*initargs)\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/pool.py\", line 109, in worker\n",
      "    initializer(*initargs)\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/site-packages/gensim/models/ldamulticore.py\", line 341, in worker_e_step\n",
      "    chunk_no, chunk, w_state = input_queue.get()\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/site-packages/gensim/models/ldamulticore.py\", line 341, in worker_e_step\n",
      "    chunk_no, chunk, w_state = input_queue.get()\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/queues.py\", line 103, in get\n",
      "    res = self._recv_bytes()\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/queues.py\", line 102, in get\n",
      "    with self._rlock:\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/connection.py\", line 216, in recv_bytes\n",
      "    buf = self._recv_bytes(maxlength)\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/synchronize.py\", line 95, in __enter__\n",
      "    return self._semlock.__enter__()\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/connection.py\", line 414, in _recv_bytes\n",
      "    buf = self._recv(4)\n",
      "  File \"/home/cehrett/.conda/envs/catching_trolls/lib/python3.9/multiprocessing/connection.py\", line 379, in _recv\n",
      "    chunk = read(handle, remaining)\n",
      "KeyboardInterrupt\n",
      "KeyboardInterrupt\n"
     ]
    }
   ],
   "source": [
    "# Compute coherence values for each set of topics\n",
    "model_list_0, coherence_values_0 = compute_coherence_values(dictionary=dictionary_0, corpus=corpus_0, texts=df_period_0['tokens'], start=4, limit=12, step=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4eef5672-36a7-45a5-9461-070501b55041",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute coherence values for each set of topics\n",
    "model_list_1, coherence_values_1 = compute_coherence_values(dictionary=dictionary_1, corpus=corpus_1, texts=df_period_1['tokens'], start=4, limit=101, step=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6c554b7-d50c-4adc-aeed-f213672786a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot coherence scores\n",
    "topic_numbers = list(range(4, 101, 4))\n",
    "\n",
    "# Create a figure with two subplots\n",
    "fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)\n",
    "\n",
    "# Plot coherence values for each category\n",
    "axs[0].plot(topic_numbers, coherence_values_0, marker='o', linestyle='-', color='b')\n",
    "axs[0].set_title('June')\n",
    "axs[0].set_xlabel('Number of Topics')\n",
    "axs[0].set_ylabel('Coherence Score')\n",
    "\n",
    "axs[1].plot(topic_numbers, coherence_values_1, marker='o', linestyle='-', color='r')\n",
    "axs[1].set_title('Oct/Nov')\n",
    "axs[1].set_xlabel('Number of Topics')\n",
    "# axs[1].set_ylabel('Coherence Score')  # Shared y-axis\n",
    "\n",
    "plt.suptitle('Coherence Scores for LDA Models')\n",
    "plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout to make room for the main title\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75a2c5df-77c0-406b-bb11-345d606a80c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_entropy(model, corpus):\n",
    "    \"\"\"\n",
    "    Calculate the entropy of the topic distribution for a given LDA model and corpus.\n",
    "    \n",
    "    Entropy is a measure of the unpredictability or disorder within the topic distribution\n",
    "    for each document in the corpus, and this function calculates the average entropy across\n",
    "    all documents to provide a measure of the overall diversity of topics within the corpus.\n",
    "    \n",
    "    Parameters:\n",
    "    - model: An instance of LdaModel trained on the corpus.\n",
    "    - corpus: The corpus on which the model is trained, in the gensim bag-of-words format.\n",
    "    \n",
    "    Returns:\n",
    "    - avg_entropy: The average entropy of the topic distribution across all documents in the corpus.\n",
    "    \"\"\"\n",
    "    entropy_list = []\n",
    "    for document in corpus:\n",
    "        topic_distribution = model.get_document_topics(document, minimum_probability=0)\n",
    "        probabilities = np.array([probability for _, probability in topic_distribution])\n",
    "        entropy = -np.sum(probabilities * np.log(probabilities))\n",
    "        entropy_list.append(entropy)\n",
    "    \n",
    "    avg_entropy = np.mean(entropy_list)\n",
    "    return avg_entropy\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "001b725f-ca53-4307-af14-f20a6f92244a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.34642845, 0.6200208)"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculate the entropy of the topic distribution\n",
    "entropy_category_0 = calculate_entropy(model_list_0[7], corpus_0)\n",
    "entropy_category_1 = calculate_entropy(model_list_1[7], corpus_1)\n",
    "\n",
    "entropy_category_0, entropy_category_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "0b0184c2-27a9-4436-a91f-3537929afe4b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0.22206607, 0.2923892),\n",
       " (0.33872557, 0.39454624),\n",
       " (0.35797223, 0.52606493),\n",
       " (0.3917019, 0.4703948),\n",
       " (0.3859179, 0.54999864),\n",
       " (0.32285845, 0.55561155),\n",
       " (0.30867648, 0.6519205),\n",
       " (0.34643093, 0.6199593),\n",
       " (0.4201682, 0.6680089),\n",
       " (0.40014446, 0.6642104),\n",
       " (0.3644532, 0.69852495),\n",
       " (0.35679424, 0.74922407)]"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculate entropies for all numbers of topics\n",
    "entropies = [(calculate_entropy(model_list_0[i], corpus_0),calculate_entropy(model_list_1[i], corpus_1)) for i in range(len(model_list_0))]\n",
    "entropies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "4fd44eb0-a6c2-49e5-9ea4-85dfffdd6157",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([1, 2, 3, 4, 1, 2, 3, 4], [1, 2, 3, 4, 1, 2, 3, 4])"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def ff():\n",
    "    return [1,2,3,4],[1,2,3,4]\n",
    "aa,bb = ff()\n",
    "a+=aa\n",
    "b+=bb\n",
    "a,b"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "catching_trolls",
   "language": "python",
   "name": "catching_trolls"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
